{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 对抗性验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°1\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's auc: 0.987009\tvalid_1's auc: 0.984863\n",
      "[200]\ttraining's auc: 0.987441\tvalid_1's auc: 0.984977\n",
      "Early stopping, best iteration is:\n",
      "[110]\ttraining's auc: 0.987545\tvalid_1's auc: 0.985349\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's auc: 0.988204\tvalid_1's auc: 0.990867\n",
      "[200]\ttraining's auc: 0.988257\tvalid_1's auc: 0.991037\n",
      "[300]\ttraining's auc: 0.992942\tvalid_1's auc: 0.99413\n",
      "[400]\ttraining's auc: 0.993155\tvalid_1's auc: 0.994297\n",
      "[500]\ttraining's auc: 0.993569\tvalid_1's auc: 0.994608\n",
      "[600]\ttraining's auc: 0.993737\tvalid_1's auc: 0.994731\n",
      "[700]\ttraining's auc: 0.996043\tvalid_1's auc: 0.996612\n",
      "[800]\ttraining's auc: 0.996876\tvalid_1's auc: 0.997133\n",
      "[900]\ttraining's auc: 0.997153\tvalid_1's auc: 0.997388\n",
      "[1000]\ttraining's auc: 0.997433\tvalid_1's auc: 0.99767\n",
      "Did not meet early stopping. Best iteration is:\n",
      "[998]\ttraining's auc: 0.997434\tvalid_1's auc: 0.99767\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's auc: 0.988624\tvalid_1's auc: 0.987713\n",
      "[200]\ttraining's auc: 0.991218\tvalid_1's auc: 0.991457\n",
      "[300]\ttraining's auc: 0.99143\tvalid_1's auc: 0.991794\n",
      "[400]\ttraining's auc: 0.993926\tvalid_1's auc: 0.993867\n",
      "Early stopping, best iteration is:\n",
      "[381]\ttraining's auc: 0.993918\tvalid_1's auc: 0.994049\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's auc: 0.990199\tvalid_1's auc: 0.988785\n",
      "[200]\ttraining's auc: 0.992542\tvalid_1's auc: 0.992246\n",
      "[300]\ttraining's auc: 0.992537\tvalid_1's auc: 0.992022\n",
      "Early stopping, best iteration is:\n",
      "[234]\ttraining's auc: 0.992697\tvalid_1's auc: 0.992387\n",
      "fold n°5\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's auc: 0.984891\tvalid_1's auc: 0.985196\n",
      "Early stopping, best iteration is:\n",
      "[82]\ttraining's auc: 0.985208\tvalid_1's auc: 0.985553\n",
      "finished!\n"
     ]
    }
   ],
   "source": [
    "# Adversarial validation: tag train rows with 0 and test rows with 1, then fit a\n",
    "# classifier to tell them apart; each row's held-out prediction is stored in `oof`.\n",
    "df_train = data_train.assign(target=0)\n",
    "df_test = data_test.assign(target=1)\n",
    "train_test = pd.concat([df_train, df_test], axis=0)\n",
    "target = train_test['target'].values\n",
    "\n",
    "param = dict(\n",
    "    num_leaves=50,\n",
    "    objective='binary',\n",
    "    max_depth=5,\n",
    "    learning_rate=0.001,\n",
    "    boosting='gbdt',\n",
    "    metric='auc',\n",
    ")\n",
    "num_round = 1000\n",
    "\n",
    "folds = KFold(n_splits=5, shuffle=True, random_state=15)\n",
    "oof = np.zeros(len(train_test))\n",
    "\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_test.values, target)):\n",
    "    print(f'fold n°{fold_ + 1}')\n",
    "    fold_train = train_test.iloc[trn_idx][feats]\n",
    "    fold_valid = train_test.iloc[val_idx][feats]\n",
    "    trn_data = lgb.Dataset(fold_train, label=target[trn_idx])\n",
    "    val_data = lgb.Dataset(fold_valid, label=target[val_idx])\n",
    "\n",
    "    clf = lgb.train(\n",
    "        param,\n",
    "        trn_data,\n",
    "        num_round,\n",
    "        valid_sets=[trn_data, val_data],\n",
    "        verbose_eval=100,\n",
    "        early_stopping_rounds=100,\n",
    "    )\n",
    "    # Only the rows held out in this fold receive a prediction\n",
    "    oof[val_idx] = clf.predict(fold_valid, num_iteration=clf.best_iteration)\n",
    "print('finished!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    12024\n",
      "1     2841\n",
      "Name: target, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Rows whose held-out adversarial score exceeds `threshold` are flagged 1, all others 0.\n",
    "threshold = 0.374\n",
    "train_test['adver'] = oof\n",
    "# One vectorised assignment replaces the chained `frame['target'][mask] = ...` writes,\n",
    "# which go through an intermediate object and raise SettingWithCopyWarning.\n",
    "train_test['target'] = (train_test['adver'] > threshold).astype('int64')\n",
    "# The first data_train.shape[0] rows of train_test are the training rows (concat order).\n",
    "data_train['target'] = train_test[:data_train.shape[0]]['target']\n",
    "print(data_train['target'].value_counts())\n",
    "data_train[['id', 'target']].to_csv('work/features/adversialdata.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°0\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's binary_logloss: 0.0216459\ttraining's f1: 0.865904\tvalid_1's binary_logloss: 0.0373368\tvalid_1's f1: 0.770747\n",
      "Early stopping, best iteration is:\n",
      "[29]\ttraining's binary_logloss: 0.0390304\ttraining's f1: 0.792353\tvalid_1's binary_logloss: 0.0470183\tvalid_1's f1: 0.828311\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's binary_logloss: 0.0238054\ttraining's f1: 0.870701\tvalid_1's binary_logloss: 0.0291009\tvalid_1's f1: 0.725806\n",
      "Early stopping, best iteration is:\n",
      "[24]\ttraining's binary_logloss: 0.043902\ttraining's f1: 0.761243\tvalid_1's binary_logloss: 0.0438345\tvalid_1's f1: 0.792639\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[100]\ttraining's binary_logloss: 0.0229815\ttraining's f1: 0.872089\tvalid_1's binary_logloss: 0.0342345\tvalid_1's f1: 0.69528\n",
      "Early stopping, best iteration is:\n",
      "[25]\ttraining's binary_logloss: 0.0423943\ttraining's f1: 0.812501\tvalid_1's binary_logloss: 0.0459671\tvalid_1's f1: 0.785924\n",
      "fold n°3\n"
     ]
    }
   ],
   "source": [
    "# Evaluate on an adversarial hold-out: fit on training rows flagged 0 and score the\n",
    "# training rows flagged 1, averaging the predictions of the per-fold models.\n",
    "adver = pd.read_csv('work/features/adversialdata.csv')\n",
    "# Swap any existing flag column for the saved one; errors='ignore' keeps the cell re-runnable.\n",
    "data_train = data_train.drop(columns=['target'], errors='ignore').merge(adver, how='inner', on='id')\n",
    "\n",
    "fs = feature_importance_df[feature_importance_df>0].index\n",
    "train_df = data_train[data_train['target']==0]\n",
    "# drop() returns a new frame, so no in-place mutation of a filtered slice\n",
    "test_df = data_train[data_train['target']==1].drop(columns=['target'])\n",
    "df_train = train_df.copy()\n",
    "df_test = test_df.copy()\n",
    "\n",
    "target = df_train['label'].values\n",
    "\n",
    "param = lgb_params\n",
    "num_round = 10000\n",
    "\n",
    "folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)\n",
    "# Despite the name, `oof` here accumulates the fold-averaged prediction for the hold-out rows\n",
    "oof = np.zeros(len(test_df))\n",
    "\n",
    "\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target)):\n",
    "    print(\"fold n°{}\".format(fold_))\n",
    "    trn_data = lgb.Dataset(df_train.iloc[trn_idx][feats], label=target[trn_idx])\n",
    "    val_data = lgb.Dataset(df_train.iloc[val_idx][feats], label=target[val_idx])\n",
    "\n",
    "    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100, feval=lgb_f1_score)\n",
    "    # Divide by the configured fold count rather than a hard-coded 5\n",
    "    oof += clf.predict(test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits\n",
    "\n",
    "holdout_label = (oof > 0.5).astype(int)\n",
    "valid_f1 = f1_score(test_df['label'], holdout_label)\n",
    "valid_p = precision_score(test_df['label'], holdout_label)\n",
    "valid_r = recall_score(test_df['label'], holdout_label)\n",
    "# AUC is a ranking metric: feed it the averaged probabilities, not the thresholded labels\n",
    "valid_auc = roc_auc_score(test_df['label'], oof)\n",
    "F = valid_p*0.7+valid_r*0.2+valid_f1*0.1\n",
    "print(\"对抗性验证加权分数：\", F)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 对抗性验证、cv验证对比\n",
    "\n",
    "| cv分数 | 对抗性验证分数 | 线上分数 | 模型 |\n",
    "| -------- | -------- | -------- | -------- |\n",
    "| 0.8444226408786157| 0.8310498322769563|0.84242879838| lgb|\n",
    "| 0.8450666512266842 (lgb) / 0.8525048299388855 (xgb) | 0.8327748703093462 | 0.84119259714 | geo |\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|0.8450666512266842  0.8525048299388855|0.8327748703093462||\n",
      "lgb mean: 0.09842441825411784  count1: 926  count0: 9074\n",
      "xgb mean: 0.24873151432462037  count1: 938  count0: 9062\n",
      "geo mean: 0.20337410213048607  count1: 935  count0: 9065\n"
     ]
    }
   ],
   "source": [
    "# Print a pipe-delimited row with both CV scores and the adversarial-validation score F,\n",
    "# then report each model's test predictions and submit the `geo` ones.\n",
    "print(f'|{cv_score_lgb}  {cv_score_xgb}|{F}||')\n",
    "for _preds, _tag in ((lgb_test, 'lgb'), (xgb_test, 'xgb'), (geo_test, 'geo')):\n",
    "    printresult(_preds, _tag)\n",
    "submitresult(geo_test, '83277geo_11_16')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 贝叶斯调参"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "def search_lgb(num_leaves,\n",
    "               max_depth,\n",
    "               max_bin,\n",
    "               bagging_fraction,\n",
    "               bagging_freq,\n",
    "               feature_fraction,\n",
    "               min_split_gain,\n",
    "               min_child_samples,\n",
    "               min_child_weight,\n",
    "               lambda_l2,\n",
    "               lambda_l1,\n",
    "               learning_rate,\n",
    "               min_data_in_leaf):\n",
    "    \"\"\"Objective function for the Bayesian hyper-parameter search.\n",
    "\n",
    "    Trains LightGBM on `data_train[feats]` / `data_train['label']` with 5-fold\n",
    "    stratified cross-validation (up to 1000 rounds, early stopping after 200)\n",
    "    and returns the mean over folds of the 'f1' value recorded in the model's\n",
    "    best_score for the validation set.\n",
    "\n",
    "    All arguments arrive as floats from the optimizer; num_leaves, max_depth,\n",
    "    max_bin, bagging_freq, min_child_samples and min_data_in_leaf are truncated\n",
    "    to int before being passed to LightGBM.\n",
    "\n",
    "    Returns:\n",
    "        float: average of the per-fold best validation F1 scores.\n",
    "    \"\"\"\n",
    "    train_x = data_train[feats]\n",
    "    train_y = data_train['label']\n",
    "    folds = 5\n",
    "    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=2020)\n",
    "\n",
    "    num_leaves = int(num_leaves)\n",
    "    max_depth = int(max_depth)\n",
    "    max_bin = int(max_bin)\n",
    "    bagging_freq = int(bagging_freq)\n",
    "    min_child_samples = int(min_child_samples)\n",
    "    min_data_in_leaf = int(min_data_in_leaf)\n",
    "\n",
    "    # Built once: the parameter set does not depend on the fold.\n",
    "    # NOTE(review): reg_lambda/lambda_l2, colsample_bytree/feature_fraction and\n",
    "    # min_child_samples/min_data_in_leaf are listed as aliases in the LightGBM docs,\n",
    "    # so only one key of each pair should take effect -- confirm and drop the other.\n",
    "    params = {\n",
    "        'learning_rate': learning_rate,\n",
    "        'boosting_type': 'gbdt',\n",
    "        'objective': 'binary',\n",
    "        'num_leaves': num_leaves,\n",
    "        'max_depth':max_depth,\n",
    "        'bagging_fraction': bagging_fraction,\n",
    "        'feature_fraction':feature_fraction,\n",
    "        'bagging_seed':2020,\n",
    "        'min_data_in_leaf':min_data_in_leaf,\n",
    "        'tree_learner':'voting',\n",
    "        'reg_lambda':8,\n",
    "        'verbose': -1,\n",
    "        'nthread': 8,\n",
    "        'colsample_bytree':0.77,\n",
    "        'min_child_weight':min_child_weight,\n",
    "        'min_child_samples':min_child_samples,\n",
    "        'min_split_gain':min_split_gain,\n",
    "        'lambda_l1': lambda_l1,\n",
    "        'max_bin':max_bin,\n",
    "        'bagging_freq':bagging_freq,\n",
    "        'lambda_l2':lambda_l2,\n",
    "    }\n",
    "\n",
    "    offline_score = []\n",
    "    for train_index, valid_index in kf.split(train_x, train_y):\n",
    "        # kf.split yields positional indices, so select labels with .iloc as well;\n",
    "        # plain train_y[...] is a label lookup and breaks on a non-default index.\n",
    "        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]\n",
    "        trn_y, val_y = train_y.iloc[train_index], train_y.iloc[valid_index]\n",
    "\n",
    "        train_matrix = lgb.Dataset(trn_x, label=trn_y)\n",
    "        valid_matrix = lgb.Dataset(val_x, label=val_y)\n",
    "\n",
    "        model = lgb.train(params, train_matrix, 1000, valid_sets=[valid_matrix], verbose_eval=100, early_stopping_rounds=200,feval=lgb_f1_score)\n",
    "        offline_score.append(model.best_score['valid_0']['f1'])\n",
    "    print(offline_score)\n",
    "    return  sum(offline_score)/folds\n",
    "from bayes_opt import BayesianOptimization\n",
    "# Search space: one (lower, upper) pair per argument of search_lgb\n",
    "pbounds = dict(\n",
    "    num_leaves=(5, 100),\n",
    "    min_data_in_leaf=(5, 150),\n",
    "    learning_rate=(0.01, 0.3),\n",
    "    lambda_l1=(0, 5),\n",
    "    max_depth=(6, 12),\n",
    "    max_bin=(30, 80),\n",
    "    bagging_fraction=(0.6, 1.0),\n",
    "    bagging_freq=(1, 50),\n",
    "    feature_fraction=(0.5, 0.8),\n",
    "    min_split_gain=(0.0, 1.0),\n",
    "    min_child_samples=(25, 125),\n",
    "    min_child_weight=(0.0, 1.0),\n",
    "    lambda_l2=(0.0, 3.0),\n",
    ")\n",
    "\n",
    "optimizer = BayesianOptimization(f=search_lgb, pbounds=pbounds, random_state=2020)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best  parameters:  {'bagging_fraction': 0.7916645558661418, 'bagging_freq': 29.482656152828152, 'feature_fraction': 0.5033671242674949, 'lambda_l1': 0.016654870990415738, 'lambda_l2': 0.49316656822413263, 'learning_rate': 0.039952637233412566, 'max_bin': 30.853314762298357, 'max_depth': 8.896569149491167, 'min_child_samples': 39.48193081908278, 'min_child_weight': 0.4295299343618967, 'min_data_in_leaf': 33.613642736201086, 'min_split_gain': 0.3267549940668256, 'num_leaves': 99.73105572529168}\r"
     ]
    }
   ],
   "source": [
    "# Run the search with init_points=5 and n_iter=800, then report the best result found\n",
    "optimizer.maximize(init_points=5, n_iter=800)\n",
    "\n",
    "print('-' * 130)\n",
    "print('Final Results')\n",
    "print('Maximum  value: {:f}'.format(optimizer.max['target']))\n",
    "print('Best  parameters: ', optimizer.max['params'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recorded note (not executable code): Maximum  value: 0.844252"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recorded parameter set (not executable code): {'bagging_fraction': 0.8005358711303714, 'bagging_freq': 1.8769441721990012, 'feature_fraction': 0.5504779502843268, 'lambda_l1': 2.5475225198526372, 'lambda_l2': 1.4960494419050065, 'learning_rate': 0.23623233221859305, 'max_bin': 30.558404027838126, 'max_depth': 11.705618442039626, 'min_child_samples': 41.248916750345366, 'min_child_weight': 0.9993336991412968, 'min_data_in_leaf': 13.73766196726757, 'min_split_gain': 0.01810624502124203, 'num_leaves': 96.6614718728411}"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "huawei_copy",
   "language": "python",
   "name": "huawei_copy"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}